Required packages
Auxiliary package with preprocessed data in a dataframe.
# Install the companion package from GitHub.
# NOTE(review): install_github() is not base R -- this assumes devtools or
# remotes is already attached; confirm which one and qualify the call.
install_github("mengluchu/globalLUR/globalLUR/globalLUR")
# Attach the package and list every function it exports.
library(globalLUR)
ls("package:globalLUR")
## [1] "Brt_imp" "Brt_LUR" "cforest_LUR"
## [4] "countrywithppm" "create_ring" "ctree_LUR"
## [7] "error_matrix" "join_by_id" "Lasso"
## [10] "Lassoselected" "mechanical" "merge_roads"
## [13] "merged" "mergeraster2file" "plot_error"
## [16] "plot_rsq" "ppm2ug" "RDring_coef"
## [19] "removedips" "rf_imp" "rf_LUR"
## [22] "sampledf" "scatterplot" "subset_grep"
## [25] "univar_rsq" "xgboost_imp" "xgboost_LUR"
If the above is not successful for macOS users, the following command and a restart of R may be needed
system('defaults write org.R-project.R force.LANG en_US.UTF-8')
Get 3 color palettes.
# Three ColorBrewer palettes used by later plots (requires RColorBrewer).
colorB <- brewer.pal(7, "Greens")    # sequential greens
colorG <- brewer.pal(11, "PiYG")     # diverging pink-green
colorS <- brewer.pal(11, "Spectral") # diverging spectral
#set.seed(2)
Dataset:
value_mean: annual mean NO2, day_value: annual mean NO2 at daytime, night_value: annual mean NO2 at night time.
1. road_XX_size: road length within a buffer with radius “size” of type XX.
2. I_size: Industrial area within a buffer with radius “size”.
3. Tropomi_2018: TROPOMI averaged over Feb 2018 - Jan 2019.
4. temperature_2m_m: monthly mean temperature at 2m height of month “m”.
5. wind_speed_10m_m: monthly mean wind speed at 10 m height of month “m”.
6. pop1k/ 3k /5k: population 1, 3, 5 km resolution.
7. Rsp: Surface remote sensing and chemical transport model product.
8. OMI_mean_filt: OMI column density, 2017 annual average.
# Load the datasets bundled with the package (added via usethis::use_data()).
data("merged")
# BUG FIX: the original called names(data), i.e. names() of the base
# function `data` itself, which returns NULL (as the rendered output
# showed). Inspect the loaded dataset instead.
names(merged)
data("countrywithppm") # countries reporting NO2 in ppm (parts per million)
summary(merged)
## LATITUDE LONGITUDE value_mean OMI_mean_filt
## Min. :-36.92 Min. :-122.463 Min. : 1.559 Min. :4.732e+14
## 1st Qu.: 32.79 1st Qu.: 2.451 1st Qu.:17.262 1st Qu.:2.122e+15
## Median : 41.53 Median : 13.323 Median :26.164 Median :3.292e+15
## Mean : 40.02 Mean : 44.988 Mean :27.905 Mean :4.418e+15
## 3rd Qu.: 48.65 3rd Qu.: 112.552 3rd Qu.:36.883 3rd Qu.:5.405e+15
## Max. : 69.66 Max. : 153.158 Max. :93.885 Max. :1.936e+16
##
## elevation pop1k RSp temperature_2m_1
## Min. :-289.0 Min. : 0 Min. :-1.000 Min. :-24.329
## 1st Qu.: 28.0 1st Qu.: 1512 1st Qu.: 1.393 1st Qu.: -1.639
## Median : 90.0 Median : 3454 Median : 2.852 Median : 2.462
## Mean : 240.7 Mean : 5926 Mean : 3.796 Mean : 2.826
## 3rd Qu.: 280.0 3rd Qu.: 6955 3rd Qu.: 5.391 3rd Qu.: 7.033
## Max. :2905.0 Max. :87147 Max. :27.538 Max. : 27.697
##
## temperature_2m_10 temperature_2m_11 temperature_2m_12 temperature_2m_2
## Min. : 0.07401 Min. :-13.067 Min. :-24.285 Min. :-18.282
## 1st Qu.:11.21953 1st Qu.: 5.208 1st Qu.: 1.772 1st Qu.: 2.869
## Median :13.65798 Median : 7.410 Median : 4.799 Median : 6.143
## Mean :14.42080 Mean : 8.504 Mean : 4.700 Mean : 6.018
## 3rd Qu.:17.09752 3rd Qu.: 12.283 3rd Qu.: 7.733 3rd Qu.: 8.918
## Max. :29.36926 Max. : 27.786 Max. : 26.249 Max. : 28.373
##
## temperature_2m_3 temperature_2m_4 temperature_2m_5 temperature_2m_6
## Min. :-7.370 Min. :-1.022 Min. : 3.573 Min. : 5.45
## 1st Qu.: 7.195 1st Qu.: 8.642 1st Qu.:14.818 1st Qu.:18.68
## Median : 9.235 Median :11.813 Median :16.577 Median :20.93
## Mean : 9.342 Mean :12.800 Mean :17.795 Mean :21.29
## 3rd Qu.:11.455 3rd Qu.:17.392 3rd Qu.:21.876 3rd Qu.:24.49
## Max. :30.274 Max. :31.617 Max. :33.735 Max. :33.63
##
## temperature_2m_7 temperature_2m_8 temperature_2m_9 wind_speed_10m_1
## Min. : 5.981 Min. : 6.031 Min. : 6.686 Min. : 1.599
## 1st Qu.:18.824 1st Qu.:18.337 1st Qu.:13.690 1st Qu.: 2.784
## Median :22.893 Median :21.328 Median :17.125 Median : 3.448
## Mean :23.032 Mean :22.257 Mean :18.295 Mean : 3.568
## 3rd Qu.:27.729 3rd Qu.:26.946 3rd Qu.:23.065 3rd Qu.: 3.933
## Max. :34.505 Max. :33.713 Max. :30.042 Max. :10.171
##
## wind_speed_10m_10 wind_speed_10m_11 wind_speed_10m_12 wind_speed_10m_2
## Min. : 1.592 Min. : 1.538 Min. : 1.388 Min. :1.695
## 1st Qu.: 2.737 1st Qu.: 2.820 1st Qu.: 2.864 1st Qu.:2.950
## Median : 3.451 Median : 3.504 Median : 3.704 Median :3.639
## Mean : 3.751 Mean : 3.676 Mean : 4.032 Mean :3.871
## 3rd Qu.: 4.329 3rd Qu.: 4.168 3rd Qu.: 5.023 3rd Qu.:4.490
## Max. :10.660 Max. :10.380 Max. :11.064 Max. :9.831
##
## wind_speed_10m_3 wind_speed_10m_4 wind_speed_10m_5 wind_speed_10m_6
## Min. :1.672 Min. :1.825 Min. :1.581 Min. :1.518
## 1st Qu.:2.921 1st Qu.:2.998 1st Qu.:2.724 1st Qu.:2.612
## Median :3.480 Median :3.347 Median :3.201 Median :3.146
## Mean :3.674 Mean :3.521 Mean :3.329 Mean :3.306
## 3rd Qu.:4.273 3rd Qu.:3.894 3rd Qu.:3.772 3rd Qu.:3.746
## Max. :8.556 Max. :8.196 Max. :6.705 Max. :7.569
##
## wind_speed_10m_7 wind_speed_10m_8 wind_speed_10m_9 pop5k
## Min. :1.163 Min. :1.330 Min. :1.425 Min. : 0
## 1st Qu.:2.662 1st Qu.:2.481 1st Qu.:2.468 1st Qu.: 69519
## Median :3.153 Median :2.816 Median :2.954 Median : 173910
## Mean :3.250 Mean :2.995 Mean :3.157 Mean : 293613
## 3rd Qu.:3.662 3rd Qu.:3.231 3rd Qu.:3.655 3rd Qu.: 345032
## Max. :7.276 Max. :6.844 Max. :8.407 Max. :3766416
##
## pop3k country ROAD_1_25 ROAD_1_50
## Min. : 0 CN :1296 Min. : -1.000 Min. : -1.000
## 1st Qu.: 32664 FR : 636 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 79828 ES : 380 Median : 0.000 Median : 0.000
## Mean : 131287 DE : 338 Mean : 2.744 Mean : 6.902
## 3rd Qu.: 163222 GB : 131 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :1860921 AT : 124 Max. :300.650 Max. :603.000
## (Other): 731
## ROAD_1_100 ROAD_1_300 ROAD_1_500 ROAD_1_800
## Min. : -1.00 Min. : -1.0 Min. : -1.0 Min. : -1
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0
## Median : 0.00 Median : 0.0 Median : 0.0 Median : 0
## Mean : 23.17 Mean : 175.1 Mean : 481.5 Mean : 1207
## 3rd Qu.: 0.00 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 2010
## Max. :1287.63 Max. :7517.8 Max. :13376.8 Max. :20041
##
## ROAD_1_1000 ROAD_1_3000 ROAD_1_5000 ROAD_2_25
## Min. : -1 Min. : -1 Min. : -1 Min. : -1.000
## 1st Qu.: 0 1st Qu.: 4380 1st Qu.: 22177 1st Qu.: 0.000
## Median : 0 Median : 14296 Median : 39887 Median : 0.000
## Mean : 1928 Mean : 17633 Mean : 46610 Mean : 5.394
## 3rd Qu.: 3524 3rd Qu.: 25798 3rd Qu.: 63770 3rd Qu.: 0.000
## Max. :25727 Max. :112511 Max. :273482 Max. :330.800
##
## ROAD_2_50 ROAD_2_100 ROAD_2_300 ROAD_2_500
## Min. : -1.00 Min. : -1.00 Min. : -1.0 Min. : -1.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.00 Median : 0.00 Median : 0.0 Median : 0.0
## Mean : 13.75 Mean : 44.43 Mean : 294.9 Mean : 798.3
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 483.3 3rd Qu.:1500.8
## Max. :559.69 Max. :1318.85 Max. :4161.3 Max. :6372.3
##
## ROAD_2_800 ROAD_2_1000 ROAD_2_3000 ROAD_2_5000
## Min. : -1 Min. : -1 Min. : -1 Min. : -1
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 7012 1st Qu.: 17286
## Median : 1146 Median : 2061 Median : 16005 Median : 35261
## Mean : 1898 Mean : 2852 Mean : 19887 Mean : 44942
## 3rd Qu.: 3156 3rd Qu.: 4397 3rd Qu.: 28344 3rd Qu.: 62093
## Max. :13732 Max. :20693 Max. :107719 Max. :251141
##
## ROAD_3_25 ROAD_3_50 ROAD_3_100 ROAD_3_300
## Min. : -1.0 Min. : -1.00 Min. : -1.00 Min. : -1.0
## 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0
## Median : 0.0 Median : 0.00 Median : 0.00 Median : 0.0
## Mean : 5.8 Mean : 14.57 Mean : 49.69 Mean : 390.8
## 3rd Qu.: 0.0 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 653.7
## Max. :259.6 Max. :447.69 Max. :973.28 Max. :3767.1
##
## ROAD_3_500 ROAD_3_800 ROAD_3_1000 ROAD_3_3000
## Min. : -1.0 Min. : -1.00 Min. : -1.0 Min. : -1
## 1st Qu.: 0.0 1st Qu.: 61.29 1st Qu.: 883.8 1st Qu.: 10603
## Median : 770.1 Median : 1987.41 Median : 3187.6 Median : 21942
## Mean :1066.5 Mean : 2540.52 Mean : 3840.6 Mean : 25476
## 3rd Qu.:1786.0 3rd Qu.: 3890.31 3rd Qu.: 5769.2 3rd Qu.: 36173
## Max. :9253.2 Max. :15623.34 Max. :20488.8 Max. :117425
##
## ROAD_3_5000 ROAD_4_25 ROAD_4_50 ROAD_4_100
## Min. : -1 Min. : -1.000 Min. : -1.00 Min. : -1.00
## 1st Qu.: 25440 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 46484 Median : 0.000 Median : 0.00 Median : 0.00
## Mean : 55963 Mean : 4.595 Mean : 11.89 Mean : 43.48
## 3rd Qu.: 76207 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 1.84
## Max. :290257 Max. :295.920 Max. :608.00 Max. :931.12
##
## ROAD_4_300 ROAD_4_500 ROAD_4_800 ROAD_4_1000
## Min. : -1.0 Min. : -1.0 Min. : -1.0 Min. : -1
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 785.2 1st Qu.: 1421
## Median : 166.6 Median : 850.1 Median : 2104.2 Median : 3352
## Mean : 392.3 Mean :1085.7 Mean : 2662.0 Mean : 4052
## 3rd Qu.: 630.6 3rd Qu.:1718.6 3rd Qu.: 3914.7 3rd Qu.: 5898
## Max. :3559.8 Max. :8337.4 Max. :19735.0 Max. :28916
##
## ROAD_4_3000 ROAD_4_5000 ROAD_5_25 ROAD_5_50
## Min. : -1 Min. : -1 Min. : -1.0 Min. : -1.00
## 1st Qu.: 13866 1st Qu.: 33964 1st Qu.: 0.0 1st Qu.: 0.00
## Median : 24213 Median : 54814 Median : 0.0 Median : 0.00
## Mean : 27911 Mean : 64167 Mean : 16.1 Mean : 42.13
## 3rd Qu.: 37575 3rd Qu.: 84278 3rd Qu.: 25.0 3rd Qu.: 75.83
## Max. :167554 Max. :318870 Max. :186.8 Max. :405.45
##
## ROAD_5_100 ROAD_5_300 ROAD_5_500 ROAD_5_800
## Min. : -1.00 Min. : -1.0 Min. : -1.0 Min. : -1
## 1st Qu.: 0.00 1st Qu.: 98.5 1st Qu.: 712.1 1st Qu.: 2073
## Median : 83.55 Median : 1178.4 Median : 3444.5 Median : 8441
## Mean : 162.68 Mean : 1521.5 Mean : 4241.5 Mean :10184
## 3rd Qu.: 282.86 3rd Qu.: 2517.5 3rd Qu.: 6888.9 3rd Qu.:16234
## Max. :1361.67 Max. :10474.6 Max. :27457.3 Max. :62956
##
## ROAD_5_1000 ROAD_5_3000 ROAD_5_5000 I_1_25
## Min. : -1 Min. : -1 Min. : -1 Min. : -1.0
## 1st Qu.: 3451 1st Qu.: 24529 1st Qu.: 54316 1st Qu.: 0.0
## Median :12659 Median : 72484 Median : 155987 Median : 0.0
## Mean :15243 Mean : 96543 Mean : 212826 Mean : 107.4
## 3rd Qu.:24161 3rd Qu.:151711 3rd Qu.: 312072 3rd Qu.: 0.0
## Max. :93332 Max. :554192 Max. :1496360 Max. :3125.0
##
## I_1_50 I_1_100 I_1_300 I_1_500
## Min. : -1.0 Min. : -1 Min. : -1 Min. : -1
## 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0.0 Median : 0 Median : 0 Median : 0
## Mean : 280.4 Mean : 1071 Mean : 10390 Mean : 31411
## 3rd Qu.: 0.0 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 1875
## Max. :8125.0 Max. :30625 Max. :275625 Max. :785625
##
## I_1_800 I_1_1000 I_1_3000 I_1_5000
## Min. : -1 Min. : -1 Min. : -1 Min. : -1
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 8125 1st Qu.: 349375
## Median : 0 Median : 0 Median : 624375 Median : 2037500
## Mean : 87671 Mean : 143734 Mean : 1325656 Mean : 3140991
## 3rd Qu.: 65000 3rd Qu.: 143281 3rd Qu.: 1948906 3rd Qu.: 4803750
## Max. :2005625 Max. :2968750 Max. :14392500 Max. :30615000
##
## Tropomi_2018 day_value night_value ID
## Min. : 82.75 Min. : 1.883 Min. : 0.4307 Min. : 1.0
## 1st Qu.: 263.42 1st Qu.:18.353 1st Qu.: 15.4550 1st Qu.: 909.8
## Median : 391.67 Median :28.433 Median : 22.3804 Median :1818.5
## Mean : 481.66 Mean :30.912 Mean : 23.5377 Mean :1818.5
## 3rd Qu.: 648.00 3rd Qu.:41.900 3rd Qu.: 29.6922 3rd Qu.:2727.2
## Max. :1441.78 Max. :94.266 Max. :100.5855 Max. :3636.0
## NA's :5 NA's :1
datatable(merged, rownames = FALSE, filter = "top", options = list(pageLength = 5, scrollX = T))
Fill missing data (coded as -1) with NA
merged_1 = na_if(merged, -1)
Merge roads of road types 3, 4, 5; the road lengths of these road types are aggregated. The original road-type columns are replaced (with keep = T, they are retained).
# Aggregate road types 3-5 into single ROAD_M345_* buffer columns; the
# per-type originals are dropped (keep = TRUE would retain them).
merged_mr <- merge_roads(merged_1, c(3, 4, 5), keep = FALSE)
names(merged_mr)
## [1] "ROAD_M345_25" "ROAD_M345_50" "ROAD_M345_100"
## [4] "ROAD_M345_300" "ROAD_M345_500" "ROAD_M345_800"
## [7] "ROAD_M345_1000" "ROAD_M345_3000" "ROAD_M345_5000"
## [10] "LATITUDE" "LONGITUDE" "value_mean"
## [13] "OMI_mean_filt" "elevation" "pop1k"
## [16] "RSp" "temperature_2m_1" "temperature_2m_10"
## [19] "temperature_2m_11" "temperature_2m_12" "temperature_2m_2"
## [22] "temperature_2m_3" "temperature_2m_4" "temperature_2m_5"
## [25] "temperature_2m_6" "temperature_2m_7" "temperature_2m_8"
## [28] "temperature_2m_9" "wind_speed_10m_1" "wind_speed_10m_10"
## [31] "wind_speed_10m_11" "wind_speed_10m_12" "wind_speed_10m_2"
## [34] "wind_speed_10m_3" "wind_speed_10m_4" "wind_speed_10m_5"
## [37] "wind_speed_10m_6" "wind_speed_10m_7" "wind_speed_10m_8"
## [40] "wind_speed_10m_9" "pop5k" "pop3k"
## [43] "country" "ROAD_1_25" "ROAD_1_50"
## [46] "ROAD_1_100" "ROAD_1_300" "ROAD_1_500"
## [49] "ROAD_1_800" "ROAD_1_1000" "ROAD_1_3000"
## [52] "ROAD_1_5000" "ROAD_2_25" "ROAD_2_50"
## [55] "ROAD_2_100" "ROAD_2_300" "ROAD_2_500"
## [58] "ROAD_2_800" "ROAD_2_1000" "ROAD_2_3000"
## [61] "ROAD_2_5000" "I_1_25" "I_1_50"
## [64] "I_1_100" "I_1_300" "I_1_500"
## [67] "I_1_800" "I_1_1000" "I_1_3000"
## [70] "I_1_5000" "Tropomi_2018" "day_value"
## [73] "night_value" "ID"
#numeric country
#inde_var$country=as.numeric(inde_var$country)
Visualize with tmap: convenient
# Promote the data frame to an sf point object from the lon/lat columns.
# NOTE(review): no crs is supplied; the coordinates look like WGS84, so
# st_as_sf(..., crs = 4326) is probably intended -- confirm.
locations_sf = st_as_sf(merged_mr, coords = c("LONGITUDE","LATITUDE"))
# Interactive dot map of annual mean NO2 on an OpenStreetMap basemap;
# popups expose the day/night values and two road buffers.
osm_valuemean = tm_shape(locations_sf) +
tm_dots( "value_mean", col = "value_mean", size = 0.05,
popup.vars = c("value_mean", "day_value", "night_value", "ROAD_2_100", "ROAD_2_5000")) + tm_view(basemaps = c('OpenStreetMap'))
#+tm_shape(lnd)+tm_lines()
# Export the interactive map as a standalone HTML file.
tmap_save(osm_valuemean, "NO2mean.html")
Visualize with leaflet: more control. Show the day/night ratio; red: day/night &gt; 1, blue: day/night &lt; 1
# Flag each station by its day/night NO2 ratio: red when daytime dominates
# (> 1), blue when night-time concentrations are higher.
merged_fp <- merged_mr %>%
  mutate(ratiodn = day_value / night_value) %>%
  mutate(color = ifelse(ratiodn > 1, "red", "blue"))
# Leaflet map: circle radius scales with the annual mean, which is also
# shown in the popup; home buttons jump to Beijing and Utrecht.
no2_map <- leaflet(merged_fp) %>%
  addTiles() %>%
  addCircleMarkers(radius = ~value_mean/5, color = ~color,
                   popup = ~as.character(value_mean), fill = FALSE) %>%
  addProviderTiles(providers$Esri.NatGeoWorldMap) %>%
  addMouseCoordinates() %>%
  addHomeButton(ext = extent(116.2, 117, 39.7, 40), layer.name = "Beijing") %>%
  addHomeButton(ext = extent(5, 5.2, 52, 52.2), layer.name = "Utrecht")
saveWidget(no2_map, file = "NO2daynight.html")
Boxplot
# Label each record "XX:Full country name"; append "*" when the country
# reports NO2 in ppm rather than ug/m3.
countryname <- paste(merged_mr$country,
                     countrycode(merged_mr$country, 'iso2c', 'country.name'),
                     sep = ":")
# SIMPLIFIED: x %in% y[y %in% x] is logically equivalent to x %in% y
# (elements of y absent from x can never produce a match).
countryname_s_e <- ifelse(merged_mr$country %in% countrywithppm,
                          paste0(countryname, "*"), countryname)
merged_mr$countryfullname <- countryname_s_e
# Colour each country's box by its median annual mean NO2.
mergedmedian <- merged_mr %>%
  group_by(country) %>%
  mutate(median = median(value_mean, na.rm = TRUE))
bp2 <- ggplot(mergedmedian, aes(x = countryfullname, y = value_mean, group = country)) +
  labs(x = "Country", y = expression(paste("NO"[2], " ", mu, "g/", "m"^3)), cex = 1.5) +
  geom_boxplot(aes(fill = median)) +
  theme(text = element_text(size = 13),
        axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_distiller(palette = "Spectral")
# scale_color_brewer(direction = 1)
print(bp2 + ylim(0, 100))
Plot the paired correlation, for road predictors, population, Tropomi. For DE, CN, and world
# DRY: the same correlation pipeline was written out three times; extract a
# helper. Upper-triangle pie correlation plot of the response, road,
# population and TROPOMI columns, optionally restricted to one country.
plot_cor_pies <- function(df, coun = NULL) {
  if (!is.null(coun)) {
    df <- df %>% filter(country == coun)
  }
  df %>%
    na.omit() %>%
    dplyr::select(matches("_value|ROAD|pop|Trop")) %>%
    cor() %>%
    corrplot(type = "upper", method = "pie", tl.cex = 0.7)
}
plot_cor_pies(merged_mr, "DE") # Germany
plot_cor_pies(merged_mr, "CN") # China
plot_cor_pies(merged_mr)       # whole world
Spatial dependency
# Empirical semivariogram of value_mean over all stations.
grd_sp <- as_Spatial(locations_sf)
dt.vgm <- variogram(value_mean ~ 1, grd_sp)
plot(dt.vgm)
# Zoom in on short-range spatial dependence (cutoff in map units).
dt.vgm <- variogram(value_mean ~ 1, grd_sp, cutoff = 10)
plot(dt.vgm)
# Per-country semivariogram of value_mean up to the given cutoff distance.
countryvariogram <- function(COUN, cutoff) {
  country_sp <- as_Spatial(locations_sf %>% filter(country == COUN))
  vgm_country <- variogram(value_mean ~ 1, country_sp, cutoff = cutoff)
  plot(vgm_country)
}
countryvariogram("DE", 1)
countryvariogram("US", 1)
countryvariogram("CN", 1) # reason?
#Moran I test
#install.packages("ape", dependencies = TRUE)
#library(ape)
#merged_mrf = merged_mr%>%filter(country == "US")
#no2.dists <- as.matrix(dist(cbind(merged_mrf$LONGITUDE, merged_mrf$LATITUDE)))
#no2.dists[1:5, 1:5]
#no2.inv <- 1/no2.dists
#diag(no2.inv) <- 0
#no2.inv[1:5, 1:5]
#Moran.I(merged_mrf$value_mean, na.rm = T, no2.inv)
Separate the dataset into training and test datasets with a fraction (here 80% of the records are used for training, the rest for testing); “DE” is the two-digit ISO code for Germany. For the world, the sampling uses the fraction per country.
#merged = merge(merged, stat[,-which(names(stat)%in%c("LATITUTE", "LONGITUDE"))], by = "ID", all.x = T)
# 80/20 train/test split for Germany; drop identifier and coordinate
# columns before modelling.
response_predictor <- globalLUR::sampledf(
  merged_mr,
  fraction = 0.8,
  country2digit = "DE",
  grepstring_rm = "ID|LATITUDE|LONGITUDE|countryfullname"
)
# Retrieve test indices, training indices, and the full variable table.
test <- response_predictor$test
training <- response_predictor$training
inde_var <- response_predictor$inde_var
inde_var <- inde_var %>% dplyr::select(-country) # country code is not a predictor
The size of test and training dataset
# test/training hold the row indices of the hold-out and training sets.
length(test)
## [1] 67
length(training)
## [1] 267
The paired correlation between dependent (mean, day, night) and independent variables. How much information does R-squared tell you?
# Check univariate R-squared: compute the per-predictor r-sq against the
# mean, day and night responses, then bind the columns into one data frame
# for plotting.
# DRY: build the predictor-only table once instead of repeating the same
# select() three times.
predictors_only <- inde_var %>% dplyr::select(-matches("value_mean|day_|night_"))
rsqmean <- univar_rsq(predictors_only, inde_var$value_mean)
rsqday <- univar_rsq(predictors_only, inde_var$day_value)
rsqnight <- univar_rsq(predictors_only, inde_var$night_value)
rsqdf <- cbind(rsqmean, rsqday, rsqnight, rownames(rsqmean))
names(rsqdf) <- c("mean", "day", "night", "vars")
plot_rsq(rsqdf = rsqdf, varname = "vars", xlab = "predictors", ylab = "R-squared")
#How does it compare to the variable importance estimated from LASSO, RF, SGB, XGB, etc.
The scatter plots between predictors and responses, mean
# Scatter plots of day_value against selected road / population /
# temperature predictors, with a loess smoother.
inde_var %>% dplyr::select(matches("ROAD_M345_3000|pop3k|ROAD_2_50$|temperature_2m_7|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "loess")
# Same for the satellite / model products (TROPOMI, OMI, Rsp).
inde_var %>% dplyr::select(matches("Tro|OMI|Rsp|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "loess")
# can choose any variable to look at the scatterplot
#inde_var %>% dplyr::select(matches("ROAD_1|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "gam")
#inde_var %>% dplyr::select(matches("ROAD_2|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "loess")
#scatterplot(inde_var, "night_value", "gam")
#scatterplot(inde_var,"value_mean", "gam" )
Extra: 5) Separate urban/rural hierarchical / two-step linear regression 6) mixed-effects regression
If simply using linear regression, the mean, day, night. Predictors are population, temperature, wind speed, GEOM product, OMI tropo column, elevation, and road buffers.
i.e. ROAD|population|value_mean|temperature|wind|GEOM product|OMI|elevation.
Note population is not always significant, though the individual R square for each buffer is high. The prediction for night is much better than for the day
inde_var_train = subset_grep(inde_var[training, ], "ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi|value_mean")
The tree and prediction error will be different if shuffling the training and testing data.
# Refit a conditional-inference tree under four different random
# train/test splits (seeds 2-5) to show how unstable the tree and its
# prediction error are.
for (i in 2:5)
{
set.seed(i)
testtree = globalLUR::sampledf(merged_mr,fraction = 0.8, "DE" )
# NOTE(review): results inside a for loop are not auto-printed; this
# presumably relies on ctree_LUR plotting/printing internally -- confirm.
with (testtree, ctree_LUR(inde_var, y_varname= c("day_value"), training = training, test = test, grepstring ="ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi" ))
}
Creates a diverse set of trees because 1) trees are unstable w.r.t. changes in learning/training data (bagging) 2) randomly preselect mtry splitting variables in each split
model training and parameter tuning
#caret
# List every model identifier that caret::train() can fit.
names(getModelInfo())
## [1] "ada" "AdaBag" "AdaBoost.M1"
## [4] "adaboost" "amdai" "ANFIS"
## [7] "avNNet" "awnb" "awtan"
## [10] "bag" "bagEarth" "bagEarthGCV"
## [13] "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "binda"
## [19] "blackboost" "blasso" "blassoAveraged"
## [22] "bridge" "brnn" "BstLm"
## [25] "bstSm" "bstTree" "C5.0"
## [28] "C5.0Cost" "C5.0Rules" "C5.0Tree"
## [31] "cforest" "chaid" "CSimca"
## [34] "ctree" "ctree2" "cubist"
## [37] "dda" "deepboost" "DENFIS"
## [40] "dnn" "dwdLinear" "dwdPoly"
## [43] "dwdRadial" "earth" "elm"
## [46] "enet" "evtree" "extraTrees"
## [49] "fda" "FH.GBML" "FIR.DM"
## [52] "foba" "FRBCS.CHI" "FRBCS.W"
## [55] "FS.HGD" "gam" "gamboost"
## [58] "gamLoess" "gamSpline" "gaussprLinear"
## [61] "gaussprPoly" "gaussprRadial" "gbm_h2o"
## [64] "gbm" "gcvEarth" "GFS.FR.MOGUL"
## [67] "GFS.LT.RS" "GFS.THRIFT" "glm.nb"
## [70] "glm" "glmboost" "glmnet_h2o"
## [73] "glmnet" "glmStepAIC" "gpls"
## [76] "hda" "hdda" "hdrda"
## [79] "HYFIS" "icr" "J48"
## [82] "JRip" "kernelpls" "kknn"
## [85] "knn" "krlsPoly" "krlsRadial"
## [88] "lars" "lars2" "lasso"
## [91] "lda" "lda2" "leapBackward"
## [94] "leapForward" "leapSeq" "Linda"
## [97] "lm" "lmStepAIC" "LMT"
## [100] "loclda" "logicBag" "LogitBoost"
## [103] "logreg" "lssvmLinear" "lssvmPoly"
## [106] "lssvmRadial" "lvq" "M5"
## [109] "M5Rules" "manb" "mda"
## [112] "Mlda" "mlp" "mlpKerasDecay"
## [115] "mlpKerasDecayCost" "mlpKerasDropout" "mlpKerasDropoutCost"
## [118] "mlpML" "mlpSGD" "mlpWeightDecay"
## [121] "mlpWeightDecayML" "monmlp" "msaenet"
## [124] "multinom" "mxnet" "mxnetAdam"
## [127] "naive_bayes" "nb" "nbDiscrete"
## [130] "nbSearch" "neuralnet" "nnet"
## [133] "nnls" "nodeHarvest" "null"
## [136] "OneR" "ordinalNet" "ordinalRF"
## [139] "ORFlog" "ORFpls" "ORFridge"
## [142] "ORFsvm" "ownn" "pam"
## [145] "parRF" "PART" "partDSA"
## [148] "pcaNNet" "pcr" "pda"
## [151] "pda2" "penalized" "PenalizedLDA"
## [154] "plr" "pls" "plsRglm"
## [157] "polr" "ppr" "PRIM"
## [160] "protoclass" "qda" "QdaCov"
## [163] "qrf" "qrnn" "randomGLM"
## [166] "ranger" "rbf" "rbfDDA"
## [169] "Rborist" "rda" "regLogistic"
## [172] "relaxo" "rf" "rFerns"
## [175] "RFlda" "rfRules" "ridge"
## [178] "rlda" "rlm" "rmda"
## [181] "rocc" "rotationForest" "rotationForestCp"
## [184] "rpart" "rpart1SE" "rpart2"
## [187] "rpartCost" "rpartScore" "rqlasso"
## [190] "rqnc" "RRF" "RRFglobal"
## [193] "rrlda" "RSimca" "rvmLinear"
## [196] "rvmPoly" "rvmRadial" "SBC"
## [199] "sda" "sdwd" "simpls"
## [202] "SLAVE" "slda" "smda"
## [205] "snn" "sparseLDA" "spikeslab"
## [208] "spls" "stepLDA" "stepQDA"
## [211] "superpc" "svmBoundrangeString" "svmExpoString"
## [214] "svmLinear" "svmLinear2" "svmLinear3"
## [217] "svmLinearWeights" "svmLinearWeights2" "svmPoly"
## [220] "svmRadial" "svmRadialCost" "svmRadialSigma"
## [223] "svmRadialWeights" "svmSpectrumString" "tan"
## [226] "tanSearch" "treebag" "vbmpRadial"
## [229] "vglmAdjCat" "vglmContRatio" "vglmCumulative"
## [232] "widekernelpls" "WM" "wsrf"
## [235] "xgbDART" "xgbLinear" "xgbTree"
## [238] "xyf"
inde_var_train = subset_grep(inde_var[training, ], "ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi|value_mean")
Training RF using Caret
Mtry
# Random forest via caret; train() tunes mtry over its default grid.
model_rf <- train(value_mean ~ ., data = inde_var_train, method = 'rf') # mtry
plot(model_rf)
# Evaluate on the held-out rows.
fitted <- predict(model_rf, inde_var[test, ])
error_matrix(prediction = fitted, validation = inde_var[test, ]$value_mean)
# Gradient boosting. BUG FIX: the original trained on inde_var[training, ],
# which still contains day_value and night_value -- components of the
# value_mean target (leakage), and inconsistent with the RF fit above.
# Train on the same restricted predictor set.
model_gbm <- train(value_mean ~ ., data = inde_var_train, method = 'gbm')
plot(model_gbm)
#gbm.step optimal number of trees.
#install.packages("caretEnsemble")
library(caretEnsemble)
# Stacking: fit several learners in one call under a shared resampling
# scheme (10-fold CV, repeated twice).
# FIXES: (1) do not shadow caret::trainControl with a variable of the same
# name; (2) drop classProbs = TRUE -- class probabilities apply only to
# classification and trigger a warning for this regression target.
cv_control <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 2,
                           savePredictions = TRUE)
algorithmList <- c('rf', 'adaboost', 'earth', 'xgbDART', 'svmRadial')
set.seed(100)
models <- caretList(value_mean ~ ., data = inde_var_train,
                    trControl = cv_control, methodList = algorithmList)
results <- resamples(models)
summary(results)
set.seed(2)
# List the metrics vip can use for permutation importance.
vip::list_metrics()
## Metric Description Task
## 1 auc Area under (ROC) curve Binary classification
## 2 error Misclassification error Binary classification
## 3 logloss Log loss Binary classification
## 4 mauc Multiclass area under (ROC) curve Multiclass classification
## 5 mlogloss Multiclass log loss Multiclass classification
## 6 mse Mean squared error Regression
## 7 r2 R squared Regression
## 8 rsquared R squared Regression
## 9 rmse Root mean squared error Regression
# Predictors plus response for a standalone ranger fit.
pre_mat <- subset_grep(inde_var_train,
                       grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop")
# 2000 trees, mtry = 33, permutation-based variable importance.
rf <- ranger(value_mean ~ ., data = pre_mat, mtry = 33,
             num.trees = 2000, importance = "permutation")
rf
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000, importance = "permutation")
##
## Type: Regression
## Number of trees: 2000
## Sample size: 267
## Number of independent variables: 65
## Mtry: 33
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 55.38431
## R squared (OOB): 0.6551465
# ranger method
# Permutation variable importance from the fitted forest.
importance(rf)
## ROAD_M345_25 ROAD_M345_50 ROAD_M345_100 ROAD_M345_300
## 1.5937397995 0.3869103843 1.0238120394 4.0543400979
## ROAD_M345_500 ROAD_M345_800 ROAD_M345_1000 ROAD_M345_3000
## 0.6517413989 0.3489697790 0.8390844666 34.9617743923
## ROAD_M345_5000 elevation pop1k temperature_2m_1
## 6.4888193292 0.5530545048 5.4438423606 0.1180595682
## temperature_2m_10 temperature_2m_11 temperature_2m_12 temperature_2m_2
## 0.8166281186 0.2851219354 0.2536844795 0.2151382204
## temperature_2m_3 temperature_2m_4 temperature_2m_5 temperature_2m_6
## 0.0393546928 0.0561927641 0.0358757873 0.2495222502
## temperature_2m_7 temperature_2m_8 temperature_2m_9 wind_speed_10m_1
## -0.1067887579 -0.0298316086 0.1039563733 -0.0634048175
## wind_speed_10m_10 wind_speed_10m_11 wind_speed_10m_12 wind_speed_10m_2
## 1.4353714203 0.2690264802 0.1028837866 0.9881879592
## wind_speed_10m_3 wind_speed_10m_4 wind_speed_10m_5 wind_speed_10m_6
## 0.7102671931 0.3061532104 0.1843722335 0.8492074342
## wind_speed_10m_7 wind_speed_10m_8 wind_speed_10m_9 pop5k
## 0.5071510735 1.7267515495 1.8329691498 5.2327781093
## pop3k ROAD_1_25 ROAD_1_50 ROAD_1_100
## 17.0247538280 0.0044800733 0.0246943849 0.0595378317
## ROAD_1_300 ROAD_1_500 ROAD_1_800 ROAD_1_1000
## 0.0584749800 0.3037501036 0.1112666838 0.0757891949
## ROAD_1_3000 ROAD_1_5000 ROAD_2_25 ROAD_2_50
## 1.2379497813 5.0620814121 12.1620790590 30.9688035370
## ROAD_2_100 ROAD_2_300 ROAD_2_500 ROAD_2_800
## 11.8755215392 2.0319057664 0.9026689173 0.4299128973
## ROAD_2_1000 ROAD_2_3000 ROAD_2_5000 I_1_25
## -0.0136316676 1.1113183194 1.0525460847 -0.0022763541
## I_1_50 I_1_100 I_1_300 I_1_500
## 0.0189187343 -0.0009304509 0.0500863673 0.1097465448
## I_1_800 I_1_1000 I_1_3000 I_1_5000
## 0.1519352991 0.0647437891 -0.0350714075 0.7847547256
## Tropomi_2018
## 1.2565969636
#vip
# Model-agnostic permutation importance via vip, once with the R-squared
# drop and once with the RMSE increase as the metric.
# NOTE(review): in recent vip versions vi(method = "permute") also requires
# train and pred_wrapper arguments -- confirm against the installed version.
DF_P_r2 = vi(rf, method = "permute", target = "value_mean", metric = "r2" ) # very clear what method is used and what is the target
DF_P_rmse = vi(rf, method = "permute", target = "value_mean", metric = "rmse")
vip (DF_P_rmse)
vip (DF_P_r2)
partial dependence plots: all the variables. (using sparklines takes a while)
library(DT)
library(sparkline)
# Importance table with inline partial-dependence sparklines.
# NOTE(review): DF is not defined anywhere in this document -- presumably
# DF_P_r2 or DF_P_rmse was intended; confirm before running.
a=add_sparklines(DF, fit = rf)
library(htmlwidgets)
saveWidget(a, file="sparkline.html")
Partial dependence plot of selected variables
# Refit on just three influential predictors for clean partial dependence
# plots: a linear model and a ranger forest on the same columns.
pre_mat_s <- inde_var_train %>% select(value_mean, ROAD_2_50, pop3k, ROAD_M345_300)
lm_s <- lm(value_mean ~ ., data = pre_mat_s)
rf_s <- ranger(value_mean ~ ., data = pre_mat_s,
               num.trees = 2000, importance = "permutation")
rf_s
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = pre_mat_s, num.trees = 2000, importance = "permutation")
##
## Type: Regression
## Number of trees: 2000
## Sample size: 267
## Number of independent variables: 3
## Mtry: 1
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 65.25373
## R squared (OOB): 0.593694
correlation
# Pairwise scatter/correlation matrix of the three selected predictors.
pre_mat_predictor = pre_mat_s%>%select(-value_mean)
ggpairs(pre_mat_predictor)
# Partial dependence of ROAD_M345_300 under the linear model versus the
# random forest.
p_lm <- partial(lm_s, "ROAD_M345_300", plot = TRUE, rug = TRUE)
plot(p_lm)
p2 <- partial(rf_s, "ROAD_M345_300", plot = TRUE, rug = TRUE)
plot(p2)
# Two-variable partial dependence (slow to compute).
pd <- partial(rf_s, pred.var = c("pop3k", "ROAD_M345_300"))
# Default two-variable PDP.
pd1 <- plotPartial(pd)
# Contour version with a red-white-blue colour ramp.
rwb <- colorRampPalette(c("red", "white", "blue"))
pdp2 <- plotPartial(pd, contour = TRUE, col.regions = rwb)
# 3-D surface
#pdp3 <- plotPartial(pd, levelplot = F, zlab = "ROAD_1_50", colorkey = T,
# screen = list(z = -20, x = -60) )
# Single-variable PDPs for the other two predictors, then arrange all five
# panels in one figure.
p3 <- partial(rf_s, "ROAD_2_50", plot = TRUE, rug = TRUE)
p1 <- partial(rf_s, "pop3k", plot = TRUE, rug = TRUE)
grid.arrange(p1, p2, p3, pd1, pdp2, ncol = 2)
# Rebuild the full predictor matrix and fit a GBM directly (gaussian loss,
# 2000 trees, depth 6, learning rate 0.01, 50% bagging).
pre_mat <- subset_grep(inde_var_train,
                       grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop")
gbm1 <- gbm(formula = value_mean ~ ., data = pre_mat,
            distribution = "gaussian", n.trees = 2000,
            interaction.depth = 6, shrinkage = 0.01, bag.fraction = 0.5)
names(pre_mat)
## [1] "ROAD_M345_25" "ROAD_M345_50" "ROAD_M345_100"
## [4] "ROAD_M345_300" "ROAD_M345_500" "ROAD_M345_800"
## [7] "ROAD_M345_1000" "ROAD_M345_3000" "ROAD_M345_5000"
## [10] "value_mean" "elevation" "pop1k"
## [13] "temperature_2m_1" "temperature_2m_10" "temperature_2m_11"
## [16] "temperature_2m_12" "temperature_2m_2" "temperature_2m_3"
## [19] "temperature_2m_4" "temperature_2m_5" "temperature_2m_6"
## [22] "temperature_2m_7" "temperature_2m_8" "temperature_2m_9"
## [25] "wind_speed_10m_1" "wind_speed_10m_10" "wind_speed_10m_11"
## [28] "wind_speed_10m_12" "wind_speed_10m_2" "wind_speed_10m_3"
## [31] "wind_speed_10m_4" "wind_speed_10m_5" "wind_speed_10m_6"
## [34] "wind_speed_10m_7" "wind_speed_10m_8" "wind_speed_10m_9"
## [37] "pop5k" "pop3k" "ROAD_1_25"
## [40] "ROAD_1_50" "ROAD_1_100" "ROAD_1_300"
## [43] "ROAD_1_500" "ROAD_1_800" "ROAD_1_1000"
## [46] "ROAD_1_3000" "ROAD_1_5000" "ROAD_2_25"
## [49] "ROAD_2_50" "ROAD_2_100" "ROAD_2_300"
## [52] "ROAD_2_500" "ROAD_2_800" "ROAD_2_1000"
## [55] "ROAD_2_3000" "ROAD_2_5000" "I_1_25"
## [58] "I_1_50" "I_1_100" "I_1_300"
## [61] "I_1_500" "I_1_800" "I_1_1000"
## [64] "I_1_3000" "I_1_5000" "Tropomi_2018"
summary(gbm1)
## var rel.inf
## ROAD_M345_3000 ROAD_M345_3000 18.635109805
## ROAD_2_50 ROAD_2_50 7.245938393
## pop3k pop3k 6.647881497
## ROAD_2_100 ROAD_2_100 5.528536677
## ROAD_1_5000 ROAD_1_5000 4.286134230
## pop1k pop1k 3.862034030
## ROAD_2_25 ROAD_2_25 3.198065949
## ROAD_M345_5000 ROAD_M345_5000 3.147119762
## ROAD_M345_1000 ROAD_M345_1000 2.653005426
## ROAD_M345_300 ROAD_M345_300 2.637273199
## pop5k pop5k 2.573345217
## ROAD_M345_100 ROAD_M345_100 2.279573049
## Tropomi_2018 Tropomi_2018 2.113698640
## ROAD_M345_500 ROAD_M345_500 1.968656395
## ROAD_1_3000 ROAD_1_3000 1.959604922
## ROAD_M345_25 ROAD_M345_25 1.795781607
## elevation elevation 1.541735961
## ROAD_2_5000 ROAD_2_5000 1.500913379
## ROAD_2_3000 ROAD_2_3000 1.437845215
## ROAD_2_1000 ROAD_2_1000 1.430160424
## wind_speed_10m_7 wind_speed_10m_7 1.419919919
## ROAD_M345_800 ROAD_M345_800 1.398825439
## ROAD_2_300 ROAD_2_300 1.233401403
## temperature_2m_6 temperature_2m_6 1.231989243
## I_1_3000 I_1_3000 1.159601408
## I_1_1000 I_1_1000 1.086322996
## ROAD_2_800 ROAD_2_800 1.063737889
## ROAD_2_500 ROAD_2_500 0.919873722
## I_1_5000 I_1_5000 0.897750484
## wind_speed_10m_1 wind_speed_10m_1 0.875728094
## ROAD_M345_50 ROAD_M345_50 0.760418968
## wind_speed_10m_9 wind_speed_10m_9 0.681294119
## ROAD_1_500 ROAD_1_500 0.667697713
## temperature_2m_4 temperature_2m_4 0.665306251
## I_1_800 I_1_800 0.626670237
## temperature_2m_8 temperature_2m_8 0.616876998
## wind_speed_10m_11 wind_speed_10m_11 0.573138320
## temperature_2m_2 temperature_2m_2 0.565620301
## wind_speed_10m_3 wind_speed_10m_3 0.543444198
## wind_speed_10m_4 wind_speed_10m_4 0.529655104
## wind_speed_10m_8 wind_speed_10m_8 0.525294128
## wind_speed_10m_10 wind_speed_10m_10 0.515074744
## I_1_500 I_1_500 0.480113722
## temperature_2m_7 temperature_2m_7 0.432547878
## temperature_2m_1 temperature_2m_1 0.430331582
## temperature_2m_9 temperature_2m_9 0.426148696
## temperature_2m_3 temperature_2m_3 0.383272647
## temperature_2m_5 temperature_2m_5 0.329065001
## wind_speed_10m_5 wind_speed_10m_5 0.325021929
## wind_speed_10m_12 wind_speed_10m_12 0.316548711
## ROAD_1_800 ROAD_1_800 0.283225900
## temperature_2m_12 temperature_2m_12 0.270442849
## ROAD_1_1000 ROAD_1_1000 0.253241626
## wind_speed_10m_2 wind_speed_10m_2 0.240541771
## temperature_2m_10 temperature_2m_10 0.229540226
## ROAD_1_300 ROAD_1_300 0.208372015
## wind_speed_10m_6 wind_speed_10m_6 0.187093811
## I_1_300 I_1_300 0.107942047
## temperature_2m_11 temperature_2m_11 0.082347925
## I_1_50 I_1_50 0.007411669
## I_1_100 I_1_100 0.006734537
## ROAD_1_25 ROAD_1_25 0.000000000
## ROAD_1_50 ROAD_1_50 0.000000000
## ROAD_1_100 ROAD_1_100 0.000000000
## I_1_25 I_1_25 0.000000000
# Joint partial dependence of the 2nd and 3rd predictor columns of pre_mat.
plot(gbm1, i.var = 2:3)
#plot(gbm1, i.var = 1)
#rf_residual <- pre_rf - rdf_test$NO2
XGBoost
Tuning XGBoost is more complex (it has many more hyperparameters to tune): https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
gamma [default=0][range: (0,Inf)] It controls regularization (i.e., helps prevent overfitting). The optimal value of gamma depends on the data set and the other parameter values. The higher the value, the stronger the regularization. Regularization means penalizing complex models that don’t improve performance; the default of 0 means no regularization. Tuning trick: start with 0 and check the CV error rate. If you see train error >>> test error, bring gamma into action.
lambda and alpha (the L2 and L1 regularization penalties, respectively)
nrounds[default=100] It controls the maximum number of iterations. For classification, it is similar to the number of trees to grow. Should be tuned using CV
eta[default=0.3][range: (0,1)] It controls the learning rate, i.e., the rate at which our model learns patterns in data. After every round, it shrinks the feature weights to reach the best optimum. Lower eta leads to slower computation. It must be supported by increase in nrounds. Typically, it lies between 0.01 - 0.3
max_depth[default=6][range: (0,Inf)] It controls the depth of the tree. Larger data sets require deep trees to learn the rules from data. Should be tuned using CV
# Build sparse train/test design matrices for XGBoost on daytime NO2.
y_varname <- "day_value"
varstring <- "ROAD|pop|temp|wind|RSp|OMI|eleva|coast|I_1|Tropo"
# Pattern keeps the response column plus all predictor groups.
prenres <- paste0(y_varname, "|", varstring)
sub_mat <- subset_grep(inde_var, prenres)
x_train <- sub_mat[training, ]
y_train <- sub_mat[training, y_varname]
x_test <- sub_mat[test, ]
y_test <- sub_mat[test, y_varname]
df_train <- data.table(x_train, keep.rownames = FALSE)
df_test <- data.table(x_test, keep.rownames = FALSE)
formu <- as.formula(paste0(y_varname, "~."))
# sparse.model.matrix adds an intercept column; [, -1] drops it.
dfmatrix_train <- sparse.model.matrix(formu, data = df_train)[, -1]
dfmatrix_test <- sparse.model.matrix(formu, data = df_test)[, -1]
train_b <- xgb.DMatrix(data = dfmatrix_train, label = y_train)
test_b <- xgb.DMatrix(data = dfmatrix_test, label = y_test)
# FIX: xgboost parameter names are case-sensitive -- "Gamma" was silently
# ignored, so the intended regularization never applied; use lowercase
# "gamma". Also, "nrounds" is not a booster parameter (it is a separate
# argument to xgb.train/xgb.cv), so it does not belong in this list.
params <- list(booster = "gbtree",
               max_depth = 4,
               eta = 0.05,
               nthread = 2,
               gamma = 2)
#xgb_t = xgb.train (params = params, data = train_b, nrounds = 500, watchlist = list(val=test_b, train = train_b), print_every_n = 10, early_stopping_rounds = 50, maximize = F , eval_metric = "rmse")
#outputvec = inde_var[training, y_varname]
# Hyperparameters for cross-validated tuning of the number of rounds.
max_depth <- 4
eta <- 0.01
nthread <- 4
nrounds <- 1000
Gamma <- 2
#simplest: tuning of rounds
# FIX: pass lowercase "gamma" (xgboost parameters are case-sensitive;
# "Gamma" was passed through but ignored by the booster) and forward
# max_depth so CV evaluates the same tree depth as the final model below
# (otherwise xgb.cv silently used its default depth of 6).
xgbcv <- xgb.cv(data = train_b, nfold = 10, nround = nrounds,
                max_depth = max_depth, eta = eta, nthread = nthread,
                gamma = Gamma, showsd = TRUE, stratified = TRUE,
                print_every_n = 200, early_stopping_rounds = 50,
                maximize = FALSE)
## [1] train-rmse:29.275966+0.273696 test-rmse:29.259226+2.462071
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
##
## [201] train-rmse:5.620102+0.070028 test-rmse:11.048028+1.523232
## [401] train-rmse:1.485326+0.060579 test-rmse:10.108446+1.349093
## Stopping. Best iteration:
## [459] train-rmse:1.100368+0.058864 test-rmse:10.085322+1.326256
# Print the CV summary: per-iteration train/test RMSE and the best iteration.
xgbcv
## ##### xgb.cv 10-folds
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1 29.2759661 0.27369602 29.25923 2.462071
## 2 29.0138133 0.27126430 29.01890 2.452536
## 3 28.7542561 0.26892306 28.78637 2.442605
## 4 28.4973000 0.26669302 28.54876 2.432511
## 5 28.2430079 0.26435966 28.32009 2.417639
## ---
## 505 0.8870495 0.05612478 10.09810 1.322524
## 506 0.8831940 0.05621285 10.09850 1.322775
## 507 0.8793501 0.05608361 10.09852 1.322913
## 508 0.8754825 0.05594716 10.09839 1.323030
## 509 0.8714984 0.05592954 10.09831 1.322678
## Best iteration:
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 459 1.100368 0.05886373 10.08532 1.326256
# Inspect the full structure of the CV object (params, folds, evaluation log).
str(xgbcv)
## List of 9
## $ call : language xgb.cv(data = train_b, nrounds = nrounds, nfold = 10, showsd = T, stratified = T, print_every_n = 200, early| __truncated__ ...
## $ params :List of 4
## ..$ eta : num 0.01
## ..$ nthread: num 4
## ..$ Gamma : num 2
## ..$ silent : num 1
## $ callbacks :List of 3
## ..$ cb.print.evaluation:function (env = parent.frame())
## .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n, showsd = showsd)
## .. ..- attr(*, "name")= chr "cb.print.evaluation"
## ..$ cb.evaluation.log :function (env = parent.frame(), finalize = FALSE)
## .. ..- attr(*, "call")= language cb.evaluation.log()
## .. ..- attr(*, "name")= chr "cb.evaluation.log"
## ..$ cb.early.stop :function (env = parent.frame(), finalize = FALSE)
## .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, verbose = verbose)
## .. ..- attr(*, "name")= chr "cb.early.stop"
## $ evaluation_log :Classes 'data.table' and 'data.frame': 509 obs. of 5 variables:
## ..$ iter : num [1:509] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ train_rmse_mean: num [1:509] 29.3 29 28.8 28.5 28.2 ...
## ..$ train_rmse_std : num [1:509] 0.274 0.271 0.269 0.267 0.264 ...
## ..$ test_rmse_mean : num [1:509] 29.3 29 28.8 28.5 28.3 ...
## ..$ test_rmse_std : num [1:509] 2.46 2.45 2.44 2.43 2.42 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
## $ niter : int 509
## $ nfeatures : int 67
## $ folds :List of 10
## ..$ : int [1:26] 99 49 148 193 65 257 16 10 27 169 ...
## ..$ : int [1:26] 95 259 46 137 167 183 110 207 7 155 ...
## ..$ : int [1:26] 103 54 147 214 15 154 29 125 129 243 ...
## ..$ : int [1:26] 108 22 237 98 198 251 165 72 12 78 ...
## ..$ : int [1:26] 190 63 11 256 140 17 56 94 187 168 ...
## ..$ : int [1:26] 206 132 241 133 246 64 231 77 116 156 ...
## ..$ : int [1:26] 176 218 123 255 113 220 26 199 90 138 ...
## ..$ : int [1:26] 192 83 224 182 75 44 180 85 55 57 ...
## ..$ : int [1:26] 233 178 24 53 185 96 235 82 158 191 ...
## ..$ : int [1:33] 86 238 81 170 248 157 260 144 88 102 ...
## $ best_iteration : int 459
## $ best_ntreelimit: num 459
## - attr(*, "class")= chr "xgb.cv.synchronous"
# Refit on the full training matrix using the CV-selected number of rounds.
# FIX: lowercase "gamma" -- the capitalized "Gamma" was silently dropped by
# xgboost, so no gamma regularization was actually applied.
bst <- xgboost(data = train_b, max_depth = max_depth, eta = eta,
               nthread = nthread, nrounds = xgbcv$best_iteration,
               gamma = Gamma, verbose = 1, print_every_n = 200,
               early_stopping_rounds = 50, maximize = FALSE)
## [1] train-rmse:29.277185
## Will train until train_rmse hasn't improved in 50 rounds.
##
## [201] train-rmse:6.472103
## [401] train-rmse:3.211355
## [459] train-rmse:2.870294
# Predict held-out daytime NO2 with the fitted booster and report the
# test-set error metrics (RMSE / MAE / IQR of errors).
xgbpre <- predict(bst, test_b)
error_matrix(y_test, xgbpre)
## RMSE MAE IQR
## 11.688346 8.401553 11.580907
# Same XGBoost fit/evaluation via the package's one-call wrapper; it prints
# feature importance followed by test-set error metrics.
pred_pattern <- "ROAD|pop|temp|wind|RSp|OMI|eleva|coast|I_1|Tropo"
xgboost_LUR(inde_var,
            max_depth = 4, eta = 0.02, nthread = 2, nrounds = 2000,
            y_varname = c("day_value"),
            training = training, test = test,
            grepstring = pred_pattern)
## Feature Gain Cover Frequency
## 1: ROAD_M345_3000 0.440978815 0.030722455 0.024549348
## 2: ROAD_2_50 0.162633167 0.010285872 0.008963251
## 3: pop1k 0.042820990 0.025945678 0.022856289
## 4: ROAD_1_5000 0.035491080 0.040208195 0.038392590
## 5: ROAD_M345_300 0.030985638 0.036846602 0.044866049
## 6: ROAD_M345_25 0.030677120 0.049502260 0.106314112
## 7: ROAD_M345_5000 0.017652947 0.034927037 0.030524848
## 8: ROAD_2_100 0.015427478 0.035027348 0.021063639
## 9: ROAD_2_300 0.013518111 0.037221001 0.022109352
## 10: pop3k 0.013220215 0.008762371 0.008266109
## 11: wind_speed_10m_10 0.012560588 0.002381560 0.006124888
## 12: ROAD_M345_1000 0.011740863 0.045738010 0.037147694
## 13: ROAD_2_3000 0.011522640 0.047562916 0.039637486
## 14: I_1_1000 0.009936516 0.023094587 0.021113435
## 15: ROAD_2_5000 0.009075646 0.034851215 0.030275869
## RMSE MAE IQR
## 11.644071 8.558915 12.994102
#xgboost_imp (variabledf = inde_var, y_varname = "day_value", max_depth = 5, eta = 0.02, nthread = 4, nrounds = 2000, training = training, test = test, grepstring = varstring )
Spatial correlation of the random forest prediction errors
# Draw an 80% training sample restricted to China (ID/country-name columns
# removed), then fit a permutation-importance random forest for annual
# mean NO2 on the selected predictor groups.
set.seed(2)
pr <- globalLUR::sampledf(merged_mr,
                          fraction = 0.8,
                          country2digit = "CN",
                          grepstring_rm = "ID|countryfullname")
inde_var_train <- with(pr, inde_var[training, ])
inde_var_test <- with(pr, inde_var[test, ])
# Keep the response (value_mean) plus the predictor groups.
pre_mat <- subset_grep(inde_var_train,
                       grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop")
rf <- ranger(value_mean ~ .,
             data = pre_mat,
             mtry = 33,
             num.trees = 2000,
             importance = "permutation")
rf
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000, importance = "permutation")
##
## Type: Regression
## Number of trees: 2000
## Sample size: 1009
## Number of independent variables: 65
## Mtry: 33
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 33.45173
## R squared (OOB): 0.7693244
# Test-set residuals (prediction minus observed) with their coordinates,
# then an empirical variogram to check for spatial correlation in the
# random-forest errors (cutoff of 1 degree -- NOTE(review): assumes
# lon/lat coordinates; confirm units).
errordf <- with(inde_var_test,
                data.frame(error = predictions(predict(rf, inde_var_test)) - value_mean,
                           LONGITUDE = LONGITUDE,
                           LATITUDE = LATITUDE))
error_sp <- errordf %>%
  st_as_sf(coords = c("LONGITUDE", "LATITUDE")) %>%
  as_Spatial
dt.vgm <- variogram(error ~ 1, error_sp, cutoff = 1)
plot(dt.vgm)
In sequence: mean, day, night. The prediction errors are much higher than for the random forest, but a much simpler model was used. The variables selected differ slightly between responses, and the variables selected also vary between runs.
# Lasso model for daytime NO2 on the same train/test split
# (vis1 = T presumably enables a coefficient/fit plot -- verify in Lasso()).
Lasso(inde_var, vis1 = T, y_varname = "day_value", training = training, test=test)
## [1] 1.083474
## [1] 2.502967
## RMSE MAE IQR
## 12.750879 9.378781 9.949547
# Lasso model for nighttime NO2, same settings as the daytime fit above.
Lasso(inde_var, vis1 = T, y_varname = "night_value", training = training, test=test)
## [1] 0.2901371
## [1] 1.410821
## RMSE MAE IQR
## 6.572639 5.415657 8.888713